We want to look at real data to simulate realistic datasets.
load("Expt4c2b_filtdata.Rda")
load("E4c2b_slingshot_wsforkelly.RData")
# select only 4 clusters : pink, cyan, orange, darkblue
select = c(3, 4, 11, 13)
cc_rev = cc_rev[select]
clus.labels = clus.labels[clus.labels %in% c(3, 4, 10, 12)]
clus.labels = droplevels(clus.labels)
names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix( ~ qcpca$x[, 1:5])
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000], ]
detection_rate <- colSums(core > 0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]
dim(core)
## [1] 1000 157
We select only the cells that pass QC, color coded by the experimental time point. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide and Fanny computers.
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
pal =colorRampPalette(c("black",'black',"red","yellow"), space="rgb")
par(mfrow = c(1,2))
plot(mean, rowMeans(core == 0), xlim = c(1,11), ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
main = '1,000 most variable genes')
smoothScatter(mean, rowMeans(core == 0),xlim = c(1,11),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = '1,000 most variable genes')
Fit data with K = 4, V intercept in both Mu and Pi, commondispersion = FALSE, and X accounting for qc. Notice the warnings.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 4, X = mod,
commondispersion = FALSE)))
## Warning in simpute.als(x, J, thresh, lambda, maxit, trace.it, warm.start, :
## Convergence not achieved by 100 iterations
## user system elapsed
## 527.478 207.518 293.303
If we simulate W from real data, it will look like that. I used K = 4, maybe K = 3 would be a better choice to simulate data.
pairs(zinb@W, col = colMerged, pch = 19, main = "ZINB")
gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]
df <- data.frame(gamma_mu = gamma_mu,
gamma_pi = gamma_pi,
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.3384076 0.5949632 0.7542561
## gamma_pi -0.3384076 1.0000000 -0.8733322 -0.1700023
## detection_rate 0.5949632 -0.8733322 1.0000000 0.3309894
## coverage 0.7542561 -0.1700023 0.3309894 1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = colMerged)
gamma = melt(gamma)
ggplot(gamma, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable) + xlab('gamma')
ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
facet_grid(~ variable) + xlab('gamma') + theme_bw()
Note that we accounting for batch and qc in X, so beta has more than one row (M = 25 rows). Here, we look only at the first row of beta.
beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]
df <- data.frame(beta_mu = beta_mu,
beta_pi = beta_pi)
pairs(df, pch=19)
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.00000000 -0.07272988
## beta_pi -0.07272988 1.00000000
beta = data.frame(beta_mu = beta_mu, beta_pi = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable, scales = 'free') + xlab('beta')
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot(outlier.shape = NA) +
facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta removing outliers') +
scale_x_discrete(breaks = c('', ''), drop=FALSE) +
scale_y_continuous(limits = c(min, max))
## Warning: Removed 1247 rows containing non-finite values (stat_boxplot).
pairs(t(zinb@alpha_mu), main = 'alpha_mu')
pairs(t(zinb@alpha_pi), main = 'alpha_pi')
df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
alpha_mu_2 = zinb@alpha_mu[2, ],
alpha_mu_3 = zinb@alpha_mu[3, ],
alpha_mu_4 = zinb@alpha_mu[4, ],
alpha_pi_1 = zinb@alpha_pi[1, ],
alpha_pi_2 = zinb@alpha_pi[2, ],
alpha_pi_3 = zinb@alpha_pi[3, ],
alpha_pi_4 = zinb@alpha_pi[4, ])
pairs(df, pch=19)
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_mu_3 alpha_mu_4 alpha_pi_1
## alpha_mu_1 1.000000000 -0.01821755 -0.10282729 -0.02862087 0.199990196
## alpha_mu_2 -0.018217554 1.00000000 0.25672718 0.12787712 0.115313683
## alpha_mu_3 -0.102827287 0.25672718 1.00000000 -0.15494610 0.026598495
## alpha_mu_4 -0.028620869 0.12787712 -0.15494610 1.00000000 -0.030037446
## alpha_pi_1 0.199990196 0.11531368 0.02659849 -0.03003745 1.000000000
## alpha_pi_2 0.252274656 -0.35305144 0.01979558 -0.08733802 0.053448173
## alpha_pi_3 -0.050755119 -0.09634999 -0.32138195 0.10866043 0.008684217
## alpha_pi_4 0.009658894 -0.18435567 0.10700202 -0.03672412 0.027262323
## alpha_pi_2 alpha_pi_3 alpha_pi_4
## alpha_mu_1 0.25227466 -0.050755119 0.009658894
## alpha_mu_2 -0.35305144 -0.096349992 -0.184355668
## alpha_mu_3 0.01979558 -0.321381945 0.107002019
## alpha_mu_4 -0.08733802 0.108660433 -0.036724117
## alpha_pi_1 0.05344817 0.008684217 0.027262323
## alpha_pi_2 1.00000000 -0.076684829 0.171043275
## alpha_pi_3 -0.07668483 1.000000000 -0.047410883
## alpha_pi_4 0.17104328 -0.047410883 1.000000000
Dispersion was estimated on FQ norm counts using function estimateDisp from edgeR. Dispersions estimated using edgeR and ZINB does not seem to be on the same scale. I think it is because I estimate dispersion on (normalized) counts with edgeR, but (if I understood well) zinb estimate the dispersion on log1p(counts). We should rely on values of dispersion from edgeR as we are going to simulates the counts with \[Y_{i,j} \sim NB(\mu_{i,j}, \phi_j)\]
par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
xlab = 'edgeR tagwise dispersion', main = 'Dispersion')
print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.137023
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')
par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=colMerged, ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=colMerged)
par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(2,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'True')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(2,8),
xlab = "Estimated Mean Expression", main = 'Estimated',
ylab = "Estimated Dropout Rate",ylim = c(0,1),
colramp = pal)
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
xlab = "Estimated Mean Expression", xlim = c(2,8),
ylab = "Estimated Dropout Rate", pch=19,
ylim = c(0, 1), main = 'Estimated')